For low income and moderate income energy policy and program planning, LEAD would be a perfect dataset based on which to conduct analytics. It provides state, county and city level data including number of households at different income levels and numbers of homeowners versus renters, and breaks down based on fuel type, building type, and construction year. as well as average monthly energy expenditures and energy burden.
There are two categories of dataset
Fields' description:
For more information, please refer to https://openei.org/doe-opendata/dataset/celica-data
PyAthena is a Python DB API 2.0 (PEP 249) compliant client for the Amazon Athena JDBC driver. https://github.com/laughingman7743/PyAthena
from pyathena.connection import Connection
from pyathena.pandas_cursor import PandasCursor
AWS_REGION_NAME = "us-west-2"
DATABASE_NAME = "oedidb"
AMI68_SCC_TABLE_NAME = "lead_ami68_state_city_county"
AMI68_TRACT_TABLE_NAME = "lead_ami68_tract"
FPL15_SCC_TABLE_NAME = "lead_fpl15_state_city_county"
FPL15_TRACT_TABLE_NAME = "lead_fpl15_tract"
S3_STAGING_DIR = "s3://nrel-tests/lead-staging"
cursor = Connection(region_name=AWS_REGION_NAME, s3_staging_dir=S3_STAGING_DIR).cursor()
pandas_cursor = Connection(region_name=AWS_REGION_NAME, s3_staging_dir=S3_STAGING_DIR).cursor(PandasCursor)
Retrieve the schema
import pandas as pd
# Retrieve schema information
result = cursor.execute(f"DESCRIBE {DATABASE_NAME}.{AMI68_SCC_TABLE_NAME}")
columns = [[item.strip() for item in row[0].split("\t")] for row in result.fetchall()]
pd.DataFrame(columns, columns=["NAME", "TYPE", "FROM"])
# Retrieve parition information
result = cursor.execute(f"SHOW PARTITIONS {DATABASE_NAME}.{AMI68_SCC_TABLE_NAME}")
for row in result.fetchall():
print(row)
# Retrieve schema information
result = cursor.execute(f"DESCRIBE {DATABASE_NAME}.{AMI68_TRACT_TABLE_NAME}")
columns = [[item.strip() for item in row[0].split("\t")] for row in result.fetchall()]
pd.DataFrame(columns, columns=["NAME", "TYPE", "FROM"])
# Retrieve parition information
result = cursor.execute(f"SHOW PARTITIONS {DATABASE_NAME}.{AMI68_TRACT_TABLE_NAME}")
for row in result.fetchall():
print(row)
# Retrieve schema information
result = cursor.execute(f"DESCRIBE {DATABASE_NAME}.{FPL15_SCC_TABLE_NAME}")
columns = [[item.strip() for item in row[0].split("\t")] for row in result.fetchall()]
pd.DataFrame(columns, columns=["NAME", "TYPE", "FROM"])
# Retrieve parition information
result = cursor.execute(f"SHOW PARTITIONS {DATABASE_NAME}.{FPL15_SCC_TABLE_NAME}")
for row in result.fetchall():
print(row)
# Retrieve schema information
result = cursor.execute(f"DESCRIBE {DATABASE_NAME}.{FPL15_TRACT_TABLE_NAME}")
columns = [[item.strip() for item in row[0].split("\t")] for row in result.fetchall()]
pd.DataFrame(columns, columns=["NAME", "TYPE", "FROM"])
# Retrieve parition information
result = cursor.execute(f"SHOW PARTITIONS {DATABASE_NAME}.{FPL15_TRACT_TABLE_NAME}")
for row in result.fetchall():
print(row)
To visualize the dataset column balues on the map
import geopandas
from branca import colormap
from branca.element import Template, MacroElement
import folium
from ipywidgets import interact
STATES = ('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY')
ami68_state = pandas_cursor.execute(
f"""
SELECT abv,
SUM(units) as units,
SUM(hincp) as hincp,
SUM(elep) as elep,
SUM(elep_cal) as elep_cal,
SUM(gasp) as gasp,
SUM(gasp_cal) as gasp_cal,
SUM(gasp_u_cal) as gasp_u_cal,
SUM(fulp) as fulp,
SUM(count) as count
FROM {DATABASE_NAME}.{AMI68_SCC_TABLE_NAME}
WHERE placename IN {STATES}
GROUP BY abv
"""
).as_pandas()
#
ami68_state.head()
geo_state = geopandas.read_file("LEAD/us-states.geojson")[["id", "geometry"]]
geo_state.rename(columns={"id": "abv"}, inplace=True)
geo_ami68_state = geopandas.GeoDataFrame(
data=ami68_state.merge(geo_state, on="abv"),
geometry="geometry",
crs={"init": "epsg:4326"}
)
geo_ami68_state.head()
columns = [ 'units', 'hincp', 'elep', 'elep_cal', 'gasp', 'gasp_cal', 'gasp_u_cal', 'fulp', 'count']
@interact
def show_ami68_state_facts(column=columns):
# Display state pv system numbers on map
state_map = folium.Map(location=[39.8283, -98.5795], zoom_start=4, tiles="OpenStreetMap")
tooltip = folium.GeoJsonTooltip(
fields=["abv", column],
aliases=["state: ", column + ":"],
labels=True,
sticky=False
)
colors = ["#ffffcc", "#fed976", "#feb24c", "#fd8d3c", "#e31a1c", "#800026"]
max_value, min_value = geo_ami68_state[column].max(), geo_ami68_state[column].min()
bins = [min_value + i * (max_value - min_value) / 6 for i in range(7)]
colorscale = colormap.StepColormap(
colors=colors,
index=bins,
vmin=0,
vmax=1000000
)
def style_function(feature):
return {
"color": "#000000",
"weight": 0.2,
"opacity": 0.6,
"fillColor": colorscale(feature["properties"][column]),
"fillOpacity": 0.4,
}
folium.GeoJson(
name="AMI State Data Facts",
data=geo_ami68_state.to_json(),
tooltip=tooltip,
style_function=style_function
).add_to(state_map)
legend = MacroElement()
with open("LEAD/map_legend.html") as f:
template = f.read()
template = template.replace(">0 - 100", ">{} - {}".format(int(bins[0]), int(bins[1])))
template = template.replace(">100 - 500", ">{} - {}".format(int(bins[1]), int(bins[2])))
template = template.replace(">500 - 1000", ">{} - {}".format(int(bins[2]), int(bins[3])))
template = template.replace(">1000 - 5000", ">{} - {}".format(int(bins[3]), int(bins[4])))
template = template.replace(">5000 - 10000", ">{} - {}".format(int(bins[4]), int(bins[5])))
template = template.replace(">10000 - 50000", ">{} - {}".format(int(bins[5]), int(bins[6])))
legend._template = Template(template)
state_map.get_root().add_child(legend)
display(state_map)
column = "elep"
# Display state pv system numbers on map
state_map = folium.Map(location=[39.8283, -98.5795], zoom_start=4, tiles="OpenStreetMap")
tooltip = folium.GeoJsonTooltip(
fields=["abv", column],
aliases=["state: ", column + ":"],
labels=True,
sticky=False
)
colors = ["#ffffcc", "#fed976", "#feb24c", "#fd8d3c", "#e31a1c", "#800026"]
max_value, min_value = geo_ami68_state[column].max(), geo_ami68_state[column].min()
bins = [min_value + i * (max_value - min_value) / 6 for i in range(7)]
colorscale = colormap.StepColormap(
colors=colors,
index=bins,
vmin=0,
vmax=1000000
)
def style_function(feature):
return {
"color": "#000000",
"weight": 0.2,
"opacity": 0.6,
"fillColor": colorscale(feature["properties"][column]),
"fillOpacity": 0.4,
}
folium.GeoJson(
name="AMI State Data Facts",
data=geo_ami68_state.to_json(),
tooltip=tooltip,
style_function=style_function
).add_to(state_map)
legend = MacroElement()
with open("LEAD/map_legend.html") as f:
template = f.read()
template = template.replace(">0 - 100", ">{} - {}".format(int(bins[0]), int(bins[1])))
template = template.replace(">100 - 500", ">{} - {}".format(int(bins[1]), int(bins[2])))
template = template.replace(">500 - 1000", ">{} - {}".format(int(bins[2]), int(bins[3])))
template = template.replace(">1000 - 5000", ">{} - {}".format(int(bins[3]), int(bins[4])))
template = template.replace(">5000 - 10000", ">{} - {}".format(int(bins[4]), int(bins[5])))
template = template.replace(">10000 - 50000", ">{} - {}".format(int(bins[5]), int(bins[6])))
legend._template = Template(template)
state_map.get_root().add_child(legend)
display(state_map)
Hierarchical clustering is a general family of clustering algorithms that build nested clusters by merging or splitting them successively. For more details, please refer to https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering
Take CO tracts as an example, to find tracts with similar characteristics on energy consumption.
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
%matplotlib inline
ami68_co_tract = pandas_cursor.execute(
f"""
SELECT geo_id,
SUM(units) as units,
SUM(hincp) as hincp,
SUM(elep) as elep,
SUM(elep_cal) as elep_cal,
SUM(gasp) as gasp,
SUM(gasp_cal) as gasp_cal,
SUM(gasp_u_cal) as gasp_u_cal,
SUM(fulp) as fulp,
SUM(count) as count
FROM {DATABASE_NAME}.{AMI68_TRACT_TABLE_NAME}
WHERE abv = 'CO'
GROUP BY geo_id
"""
).as_pandas()
ami68_co_tract.head()
points = ami68_co_tract.drop(columns=["geo_id"]).to_numpy()
labels = ami68_co_tract["geo_id"].values
fig = plt.figure(figsize=(80, 600))
# create dendrogram
dendrogram = sch.dendrogram(sch.linkage(points, method="ward"), orientation="right", leaf_font_size=20, labels=labels)
# create clusters
hc = AgglomerativeClustering(n_clusters=4, affinity ="euclidean", linkage="ward")
# save clusters for chart
hc.fit_predict(points)
plt.show()
Show FPL data facts on Map
fpl15_state = pandas_cursor.execute(
f"""
SELECT abv,
SUM(units) as units,
SUM(hincp) as hincp,
SUM(elep) as elep,
SUM(elep_cal) as elep_cal,
SUM(gasp) as gasp,
SUM(gasp_cal) as gasp_cal,
SUM(gasp_u_cal) as gasp_u_cal,
SUM(fulp) as fulp,
SUM(count) as count
FROM {DATABASE_NAME}.{FPL15_SCC_TABLE_NAME}
WHERE placename IN {STATES}
GROUP BY abv
"""
).as_pandas()
#
fpl15_state.head()
geo_state = geopandas.read_file("LEAD/us-states.geojson")[["id", "geometry"]]
geo_state.rename(columns={"id": "abv"}, inplace=True)
geo_fpl15_state = geopandas.GeoDataFrame(
data=fpl15_state.merge(geo_state, on="abv"),
geometry="geometry",
crs={"init": "epsg:4326"}
)
columns = [ 'units', 'hincp', 'elep', 'elep_cal', 'gasp', 'gasp_cal', 'gasp_u_cal', 'fulp', 'count']
@interact
def show_fpl15_state_facts(column=columns):
# Display state pv system numbers on map
state_map = folium.Map(location=[39.8283, -98.5795], zoom_start=4, tiles="OpenStreetMap")
tooltip = folium.GeoJsonTooltip(
fields=["abv", column],
aliases=["state: ", column + ":"],
labels=True,
sticky=False
)
colors = ["#ffffcc", "#fed976", "#feb24c", "#fd8d3c", "#e31a1c", "#800026"]
max_value, min_value = geo_fpl15_state[column].max(), geo_fpl15_state[column].min()
bins = [min_value + i * (max_value - min_value) / 6 for i in range(7)]
colorscale = colormap.StepColormap(
colors=colors,
index=bins,
vmin=0,
vmax=1000000
)
def style_function(feature):
return {
"color": "#000000",
"weight": 0.2,
"opacity": 0.6,
"fillColor": colorscale(feature["properties"][column]),
"fillOpacity": 0.4,
}
folium.GeoJson(
name="FPL State Data Facts",
data=geo_fpl15_state.to_json(),
tooltip=tooltip,
style_function=style_function
).add_to(state_map)
legend = MacroElement()
with open("LEAD/map_legend.html") as f:
template = f.read()
template = template.replace(">0 - 100", ">{} - {}".format(int(bins[0]), int(bins[1])))
template = template.replace(">100 - 500", ">{} - {}".format(int(bins[1]), int(bins[2])))
template = template.replace(">500 - 1000", ">{} - {}".format(int(bins[2]), int(bins[3])))
template = template.replace(">1000 - 5000", ">{} - {}".format(int(bins[3]), int(bins[4])))
template = template.replace(">5000 - 10000", ">{} - {}".format(int(bins[4]), int(bins[5])))
template = template.replace(">10000 - 50000", ">{} - {}".format(int(bins[5]), int(bins[6])))
legend._template = Template(template)
state_map.get_root().add_child(legend)
display(state_map)
column = "elep"
# Display state pv system numbers on map
state_map = folium.Map(location=[39.8283, -98.5795], zoom_start=4, tiles="OpenStreetMap")
tooltip = folium.GeoJsonTooltip(
fields=["abv", column],
aliases=["state: ", column + ":"],
labels=True,
sticky=False
)
colors = ["#ffffcc", "#fed976", "#feb24c", "#fd8d3c", "#e31a1c", "#800026"]
max_value, min_value = geo_fpl15_state[column].max(), geo_fpl15_state[column].min()
bins = [min_value + i * (max_value - min_value) / 6 for i in range(7)]
colorscale = colormap.StepColormap(
colors=colors,
index=bins,
vmin=0,
vmax=1000000
)
def style_function(feature):
return {
"color": "#000000",
"weight": 0.2,
"opacity": 0.6,
"fillColor": colorscale(feature["properties"][column]),
"fillOpacity": 0.4,
}
folium.GeoJson(
name="FPL State Data Facts",
data=geo_fpl15_state.to_json(),
tooltip=tooltip,
style_function=style_function
).add_to(state_map)
legend = MacroElement()
with open("LEAD/map_legend.html") as f:
template = f.read()
template = template.replace(">0 - 100", ">{} - {}".format(int(bins[0]), int(bins[1])))
template = template.replace(">100 - 500", ">{} - {}".format(int(bins[1]), int(bins[2])))
template = template.replace(">500 - 1000", ">{} - {}".format(int(bins[2]), int(bins[3])))
template = template.replace(">1000 - 5000", ">{} - {}".format(int(bins[3]), int(bins[4])))
template = template.replace(">5000 - 10000", ">{} - {}".format(int(bins[4]), int(bins[5])))
template = template.replace(">10000 - 50000", ">{} - {}".format(int(bins[5]), int(bins[6])))
legend._template = Template(template)
state_map.get_root().add_child(legend)
state_map
Spatial Autocorrelation - Attribute Similarity Measurements.
Take CO tracts as an example (Notes - there are some multipolygons which were removed in this analysis example.)
import mapclassify as mc
import pysal
from pysal.lib import weights
from pysal.viz.splot import libpysal
fpl15_co_tract = pandas_cursor.execute(
f"""
SELECT geo_id,
SUM(units) as units,
SUM(hincp) as hincp,
SUM(elep) as elep,
SUM(elep_cal) as elep_cal,
SUM(gasp) as gasp,
SUM(gasp_cal) as gasp_cal,
SUM(gasp_u_cal) as gasp_u_cal,
SUM(fulp) as fulp,
SUM(count) as count
FROM {DATABASE_NAME}.{FPL15_TRACT_TABLE_NAME}
WHERE abv = 'CO'
GROUP BY geo_id
"""
).as_pandas()
fpl15_co_tract.loc[:, "geo_id"] = ["0"+geo_id for geo_id in fpl15_co_tract["geo_id"].values]
fpl15_co_tract.head()
geo_co_tract = geopandas.read_file("LEAD/co_tracts.geojson")
geo_co_tract.rename(columns={"FIPS": "geo_id"}, inplace=True)
geo_co_tract.drop(columns=["OBJECTID"], inplace=True)
fpl15_geo_co_tract = geopandas.GeoDataFrame(
data=fpl15_co_tract.merge(geo_co_tract, on="geo_id"),
geometry="geometry",
crs={"init": "epsg:4326"}
)
fpl15_geo_co_tract.head()
w = weights.contiguity.Queen.from_dataframe(fpl15_geo_co_tract)
w.transform = "r"
libpysal.plot_spatial_weights(w, fpl15_geo_co_tract, figsize=(30, 30))
y = fpl15_geo_co_tract["elep"]
ylag = weights.lag_spatial(w, y)
ylagq5 = mc.Quantiles(ylag, k=5)
fig, ax = plt.subplots(1, figsize=(30, 30))
fpl15_geo_co_tract.assign(cl=ylagq5.yb).plot(column="cl", categorical=True, k=5, cmap="GnBu", linewidth=0.1, ax=ax, edgecolor="white", legend=True)
ax.set_axis_off()
plt.title("Spatial Lag Median ELEP Price (Quintiles)")
plt.show()